library(tidyverse)
library(magrittr)
library(plotly)
library(forecast)
library(ggplot2)
# Data Loading ----
# Working with ILINet only, as the table broken down by age group doesn't have
# State info.
# Rows are sorted by region, then chronologically (year, week).
ilinet_tbl <- read_csv('input/ILINet.csv') %>%
  arrange(REGION, YEAR, WEEK)
# Preprocessing ----
# Check number of unique values per feature
# (returns a named list with one count per column; NA counts as a value)
map(ilinet_tbl, ~ length(unique(.x)))
# Drop irrelevant features
# Keep only the columns holding more than two distinct values; constant and
# two-valued columns are removed.
ilinet_tbl <- ilinet_tbl[, map_lgl(ilinet_tbl, ~ length(unique(.x)) > 2)]
# Investigate why WEEK has 53 distinct values (a year has 52 weeks)
# Frequency of each WEEK value across all rows.
table(ilinet_tbl$WEEK)
# TODO verify what is the week 53 (maybe leap year?), dropping for now
# NOTE(review): surveillance calendars can have 53 weeks in some years, so
# these rows are probably real observations -- confirm before dropping for good.
ilinet_tbl %<>%
  filter(WEEK != 53)
# Coerce numerical features to numeric (this will introduce NAs where
# value == 'X')
# NOTE(review): columns are picked by position (4 to 7), which depends on the
# column order left by the feature-dropping step above -- verify if the input
# layout changes.
ilinet_tbl[, 4:7] <- map(ilinet_tbl[, 4:7], as.numeric)
# Check number of NAs per feature
# One row per REGION, each remaining column replaced by its NA count.
ilinet_tbl %>%
  group_by(REGION) %>%
  summarise_all(function(x) sum(is.na(x))) %>%
  View()
# NAs are mostly divided between Florida and Virgin Islands, let's
# investigate further
# Same per-region summary, but as the share of NA values in each column
# (mean of the is.na() mask == NA count / number of rows).
ilinet_tbl %>%
  filter(REGION %in% c('Florida', 'Virgin Islands')) %>%
  group_by(REGION) %>%
  summarise_all(function(x) mean(is.na(x))) %>%
  View()
# 100% of Florida info is NA, 60% of Virgin Islands info is NA
# TODO decide what to do with these states later, dropping for now
ilinet_tbl %<>%
  filter(!(REGION %in% c('Florida', 'Virgin Islands')))
# Combine YEAR and WEEK info
# YEARWEEK is a "<year>-<week>" label. PRETTYDATE is a real Date: the Monday
# (%u = 1) of week WEEK, using strptime's Sunday-based %U week numbering.
# NOTE(review): %U weeks may not line up exactly with the week numbering used
# by the data provider -- confirm if exact calendar dates matter.
ilinet_tbl %<>%
  mutate(
    YEARWEEK   = paste(YEAR, WEEK, sep = '-'),
    PRETTYDATE = as.Date(paste(YEAR, WEEK, 1, sep = "-"), "%Y-%U-%u")
  )
# Train/Test split ----
# Use last 3 weeks as Test Set (as data is released w/ 2 weeks lag + 1 week
# protection against data release delay).
# Observations are weekly, so the date exactly 3 weeks before the latest one
# belongs to train; putting it in test would give 4 test weeks instead of 3.
# na.rm = TRUE keeps a single unparsable date from turning the cutoff into NA
# (which would silently empty both sets).
split_cutoff <- max(ilinet_tbl$PRETTYDATE, na.rm = TRUE) - lubridate::weeks(3)

train <- ilinet_tbl %>%
  filter(PRETTYDATE <= split_cutoff)

test <- ilinet_tbl %>%
  filter(PRETTYDATE > split_cutoff)
# Persist both splits, each as an RDS file and as a CSV file.
# Create the output directory first so the writes do not fail when 'data/'
# is missing; this is a no-op if it already exists.
dir.create('data', showWarnings = FALSE, recursive = TRUE)
saveRDS(train, 'data/train.rds')
saveRDS(test, 'data/test.rds')
write_csv(train, 'data/train.csv')
write_csv(test, 'data/test.csv')
# General overview of some of the regions
# Interactive box plot: distribution of ILITOTAL for each REGION in train.
plot_ly(train, x = ~REGION, y = ~ILITOTAL, type = "box")
# ILITOTAL over time for a first sample of regions, one facet row per region.
trainSubSet <- filter(
  train,
  REGION %in% c('Alabama', 'California', 'Hawaii', 'New York', 'Virginia')
)
# Map columns by bare name: `trainSubSet$...` inside aes() detaches the
# aesthetic from the data ggplot splits per facet, and the facet variable must
# be the REGION column itself (one value per row), not unique(REGION).
sp <- ggplot(trainSubSet, aes(x = PRETTYDATE, y = ILITOTAL)) + geom_line()
sp + facet_grid(REGION ~ .)

# ILITOTAL over time for a second sample of regions, one facet row per region.
trainSubSet <- filter(
  train,
  REGION %in% c('New Mexico', 'Louisiana', 'Montana', 'Oklahoma', 'Vermont')
)
# Map columns by bare name: `trainSubSet$...` inside aes() detaches the
# aesthetic from the data ggplot splits per facet, and the facet variable must
# be the REGION column itself (one value per row), not unique(REGION).
sp <- ggplot(trainSubSet, aes(x = PRETTYDATE, y = ILITOTAL)) + geom_line()
sp + facet_grid(REGION ~ .)

LS0tDQp0aXRsZTogIlIgTm90ZWJvb2siDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQoNCmBgYHtyfQ0KbGlicmFyeSh0aWR5dmVyc2UpDQpsaWJyYXJ5KG1hZ3JpdHRyKQ0KbGlicmFyeShwbG90bHkpDQpsaWJyYXJ5KGZvcmVjYXN0KQ0KbGlicmFyeShnZ3Bsb3QyKQ0KYGBgDQoNCiMgRGF0YSBMb2FkaW5nIC0tLS0NCiMgV29ya2luZyB3aXRoIGlsaW5ldCBvbmx5IGFzIHRoZSB0YWJsZSBicm9rZSBieSBhZ2UgZ3JvdXAgZG9lc24ndCBoYXZlIFN0YXRlIGluZm8NCg0KYGBge3J9DQppbGluZXRfdGJsICA8LSByZWFkX2NzdignaW5wdXQvSUxJTmV0LmNzdicpICU+JSANCiAgYXJyYW5nZShSRUdJT04sIFlFQVIsIFdFRUspDQpgYGANCg0KUHJlcHJvY2Vzc2luZyAtLS0tDQojIENoZWNrIG51bWJlciBvZiB1bmlxdWUgdmFsdWVzIHBlciBmZWF0dXJlDQoNCmBgYHtyfQ0KbWFwKGlsaW5ldF90YmwsIH5sZW5ndGgodW5pcXVlKC54KSkpDQpgYGANCg0KIyBEcm9wIGlycmVsZXZhbnQgZmVhdHVyZXMNCmBgYHtyfQ0KaWxpbmV0X3RibCA8LSBpbGluZXRfdGJsWywgbWFwX2xnbChpbGluZXRfdGJsLCB+bGVuZ3RoKHVuaXF1ZSgueCkpID4gMildDQpgYGANCg0KIyBJbnZlc3RpZ2F0ZSB3aHkgV0VFSyBoYXMgNTMgZGlzdGluY3QgdmFsdWVzIChhIHllYXIgaGFzIDUyIHdlZWtzKQ0KYGBge3J9DQp0YWJsZShpbGluZXRfdGJsJFdFRUspDQpgYGANCg0KIyBUT0RPIHZlcmlmeSB3aGF0IGlzIHRoZSB3ZWVrIDUzIChtYXliZSBsZWFwIHllYXI/KSwgZHJvcHBpbmcgZm9yIG5vdw0KYGBge3J9DQppbGluZXRfdGJsICU8PiUNCiAgZmlsdGVyKFdFRUsgIT0gNTMpDQpgYGANCg0KIyBDb2VyY2UgbnVtZXJpY2FsIGZlYXR1cmVzIHRvIG51bWVyaWMgKHRoaXMgd2lsbCBpbnRyb2R1Y2UgTkFzIHdoZXJlIHZhbHVlID09ICdYJykNCmBgYHtyfQ0KaWxpbmV0X3RibFssIDQ6N10gPC0gbWFwKGlsaW5ldF90YmxbLCA0OjddLCBhcy5udW1lcmljKQ0KYGBgDQoNCiMgQ2hlY2sgbnVtYmVyIG9mIE5BcyBwZXIgZmVhdHVyZQ0KYGBge3J9DQppbGluZXRfdGJsICU+JSANCiAgZ3JvdXBfYnkoUkVHSU9OKSAlPiUgDQogIHN1bW1hcmlzZV9hbGwoZnVuY3Rpb24oeCl7c3VtKGlzLm5hKHgpKX0pICU+JSANCiAgVmlldw0KYGBgDQojIE5BcyBhcmUgbW9zdGx5IGRpdmlkZWQgYmV0d2VlbiBGbG9yaWRhIGFuZCBWaXJnaW4gSXNsYW5kcywgbGV0J3MNCiMgaW52ZXN0aWdhdGUgZnVydGhlcg0KYGBge3J9DQppbGluZXRfdGJsICU+JSANCiAgZmlsdGVyKFJFR0lPTiAlaW4lIGMoJ0Zsb3JpZGEnLCAnVmlyZ2luIElzbGFuZHMnKSkgJT4lIA0KICBncm91cF9ieShSRUdJT04pICU+JSANCiAgc3VtbWFyaXNlX2FsbChmdW5jdGlvbih4KXtzdW0oaXMubmEoeCkpL2xlbmd0aCh4KX0pICU+JSANCiAgVmlldw0KYGBgDQoNCiMgMTAwJSBvZiBGbG9yaWRhIGluZm8gaXMgTkEsIDYwJSBvZiBWaXJnaW4gSXNsYW5kcyBpbmZvIGlzIE5BDQojIFRPRE8gZGVjaWRlIHdo
YXQgdG8gZG8gd2l0aCB0aGVzZSBzdGF0ZXMgbGF0ZXIsIGRyb3BwaW5nIGZvciBub3cNCmBgYHtyfQ0KaWxpbmV0X3RibCAlPD4lIA0KICBmaWx0ZXIoIShSRUdJT04gJWluJSBjKCdGbG9yaWRhJywgJ1ZpcmdpbiBJc2xhbmRzJykpKQ0KYGBgDQojIENvbWJpbmUgWUVBUiBhbmQgV0VFSyBpbmZvDQoNCmBgYHtyfQ0KaWxpbmV0X3RibCAlPD4lDQogIG11dGF0ZSgNCiAgICBZRUFSV0VFSyAgID0gcGFzdGUoWUVBUiwgV0VFSywgc2VwID0gJy0nKSwgDQogICAgUFJFVFRZREFURSA9IGFzLkRhdGUocGFzdGUoWUVBUiwgV0VFSywgMSwgc2VwPSItIiksICIlWS0lVS0ldSIpDQogICkNCmBgYA0KIyBUcmFpbi9UZXN0IHNwbGl0IC0tLS0NCiMgVXNlIGxhc3QgMyB3ZWVrcyBhcyBUZXN0IFNldCAoYXMgZGF0YSBpcyByZWxlYXNlZCB3LyAyIHdlZWtzIGxhZyArIDEgd2VlayBwcm90ZWN0aW9uDQojIGFnYWluc3QgZGF0YSByZWxlYXNlIGRlbGF5KQ0KYGBge3J9DQp0cmFpbiA8LSBpbGluZXRfdGJsICU+JSANCiAgZmlsdGVyKFBSRVRUWURBVEUgPCBtYXgoUFJFVFRZREFURSkgLSBsdWJyaWRhdGU6OndlZWtzKDMpKQ0KDQp0ZXN0IDwtIGlsaW5ldF90YmwgJT4lIA0KICBmaWx0ZXIoUFJFVFRZREFURSA+PSBtYXgoUFJFVFRZREFURSkgLSBsdWJyaWRhdGU6OndlZWtzKDMpKQ0KYGBgDQoNCmBgYHtyfQ0KIHNhdmVSRFModHJhaW4sICdkYXRhL3RyYWluLnJkcycpDQogc2F2ZVJEUyh0ZXN0LCAnZGF0YS90ZXN0LnJkcycpDQogd3JpdGVfY3N2KHRyYWluLCAnZGF0YS90cmFpbi5jc3YnKQ0KIHdyaXRlX2Nzdih0ZXN0LCAnZGF0YS90ZXN0LmNzdicpDQpgYGANCg0KI0dlbmVyYWwgb3ZlcnZpZXcgb2Ygc29tZSBvZiB0aGUgcmVnaW9ucw0KYGBge3J9DQpwbG90X2x5KHRyYWluLCB4ID0gflJFR0lPTiwgeSA9IH5JTElUT1RBTCwgIHR5cGUgPSAiYm94IikNCmBgYA0KDQoNCmBgYHtyfQ0KdHJhaW5TdWJTZXQgPC0gZmlsdGVyKHRyYWluLCBSRUdJT04gJWluJSBjKCdBbGFiYW1hJywgJ0NhbGlmb3JuaWEnLCdIYXdhaWknLCdOZXcgWW9yaycsJ1ZpcmdpbmlhJykpDQpzcCA8LSBnZ3Bsb3QodHJhaW5TdWJTZXQsIGFlcyh4PXRyYWluU3ViU2V0JFBSRVRUWURBVEUsIHk9dHJhaW5TdWJTZXQkSUxJVE9UQUwpKSArIGdlb21fbGluZSgpDQpzcCArIGZhY2V0X2dyaWQodW5pcXVlKHRyYWluU3ViU2V0JFJFR0lPTikgfiAuKQ0KYGBgDQoNCg0KYGBge3J9DQp0cmFpblN1YlNldCA8LSBmaWx0ZXIodHJhaW4sIFJFR0lPTiAlaW4lIGMoJ05ldyBNZXhpY28nLCAnTG91aXNpYW5hJywnTW9udGFuYScsJ09rbGFob21hJywnVmVybW9udCcpKQ0Kc3AgPC0gZ2dwbG90KHRyYWluU3ViU2V0LCBhZXMoeD10cmFpblN1YlNldCRQUkVUVFlEQVRFLCB5PXRyYWluU3ViU2V0JElMSVRPVEFMKSkgKyBnZW9tX2xpbmUoKQ0Kc3AgKyBmYWNldF9ncmlkKHVuaXF1ZSh0cmFpblN1YlNldCRSRUdJT04pIH4gLikNCmBgYA0KDQo=